# Notebook setup: import every plotting library used in this file and
# configure the plotly and altair notebook renderers.
# matplotlib
import matplotlib.pyplot as plt
# data manipulation
import numpy as np
import pandas as pd
# altair
import altair as alt
# seaborn
import seaborn as sns
# plotnine (the star import provides ggplot, aes, geom_point, ggtitle, xlab, ylab unqualified)
from plotnine import *
# plotly (note: `py` is an alias for the plotly package itself)
import plotly.graph_objs as go
import plotly as py
import plotly.express as px
import plotly.io as pio
# use the "seaborn" template as the default for all plotly figures
pio.templates.default = "seaborn"
# enable plotly offline mode for notebook output
py.offline.init_notebook_mode()
# project-local helpers, imported from the `utils` module via the ipynb package
import ipynb.fs.full.utils as utils
# filesystem path handling
from pathlib import Path
# activate altair's 'default' renderer
alt.renderers.enable('default')
# Load the student records and derive the two cohorts used by the plots below.
# The CSV is read from <working directory>/data/v1_4/students_data.csv.
CURR_PATH = Path.cwd().joinpath('data', 'v1_4')
filename = Path(CURR_PATH, 'students_data').with_suffix('.csv')
# the file is semicolon-separated
students_df = pd.read_csv(filename, sep=';')

# Both cohorts are restricted to Program_ID 143; build that mask only once.
in_program = students_df.Program_ID == 143
# .copy() detaches each cohort from students_df so that the column assignments
# and in-place operations performed on it later cannot trigger pandas'
# SettingWithCopyWarning.
graduate_students = students_df[
    in_program & (students_df.Student_Label == "Graduate")
].drop_duplicates().copy()
dropout_students = students_df[
    in_program & (students_df.Student_Label == "Dropout")
].drop_duplicates().copy()

# Reformat the semester columns with the project helper utils.formatSemester
# (defined outside this file). np.array() makes the assignment positional
# rather than index-aligned.
graduate_students["Graduation_Semester"] = np.array(
    utils.formatSemester(graduate_students, "Graduation_Semester")
)
graduate_students["Start_Semester"] = np.array(
    utils.formatSemester(graduate_students, "Start_Semester")
)

# Notebook inspection step: list the distinct study durations.
graduate_students.Duration_Start_Graduation.unique()

# Remove the durations treated as outliers (1 and 3). Rows with a missing
# duration are kept, exactly as with the original `!= 1` / `!= 3` comparison.
graduate_students = graduate_students[
    ~graduate_students.Duration_Start_Graduation.isin([1, 3])
].copy()
# matplotlib: scatter of study duration (x) against graduation semester (y),
# drawing one labelled series per graduation grade.
fig, ax = plt.subplots(figsize=(10, 5))
for grade, grade_rows in graduate_students.groupby('Graduation_Grade'):
    ax.scatter(
        grade_rows['Duration_Start_Graduation'],
        grade_rows['Graduation_Semester'],
        label=grade,
    )
# the legend entries come from the `label` of each scatter call above
ax.legend()
ax.set(
    title='Anzahl Semester vs Abschlussnote',
    xlabel='Anzahl Semester',
    ylabel='Jahr',
)
# seaborn: the same scatter, coloured by graduation grade through the
# FacetGrid `hue` argument.
g = sns.FacetGrid(
    graduate_students,
    hue='Graduation_Grade',
    height=6,
    aspect=1.5,
)
# draw one plt.scatter layer per hue level
g.map(plt.scatter, 'Duration_Start_Graduation', 'Graduation_Semester')
g.add_legend()
g = g.set(
    title='Anzahl Semester vs Abschlussnote',
    xlabel='Anzahl Semester',
    ylabel='Jahr',
)
# plotnine: the same scatter as a grammar-of-graphics expression. It is left
# as a bare expression so a notebook cell displays the resulting plot.
(
    ggplot(
        graduate_students,
        aes(
            x='Duration_Start_Graduation',
            y='Graduation_Semester',
            color='Graduation_Grade',
        ),
    )
    + geom_point()
    + labs(
        title='Anzahl Semester vs Abschlussnote',
        x='Anzahl Semester',
        y='Jahr',
    )
)
# plotly graph_objs: the same scatter, built as one marker trace per grade.
# The frame is sorted in place by graduation semester first, so every later
# use of graduate_students sees the sorted order as well.
graduate_students.sort_values("Graduation_Semester", inplace=True)

grade_column = graduate_students['Graduation_Grade']
traces = []
# The loop variable is named `grade` (not `g`) so it no longer overwrites the
# seaborn FacetGrid handle `g`. The per-grade row selection is computed once
# and reused for both axes. unique() keeps first-appearance order, which
# fixes the order of the legend entries.
for grade in grade_column.unique():
    grade_rows = graduate_students[grade_column == grade]
    traces.append(
        go.Scatter(
            mode='markers',
            x=grade_rows.Duration_Start_Graduation,
            y=grade_rows.Graduation_Semester,
            name=grade,
        )
    )

fig = go.Figure(
    data=traces,
    layout=dict(
        width=700,
        title='Anzahl Semester vs Abschlussnote',
        xaxis={'title': 'Anzahl Semester'},
        yaxis={'title': 'Jahr'},
    ),
)
fig.show()
# altair: the same scatter as a declarative chart. It is left as a bare
# expression so a notebook cell renders the chart.
alt.Chart(
    graduate_students,
    title='Anzahl Semester vs Abschlussnote',
).mark_circle().encode(
    x=alt.X('Duration_Start_Graduation', title='Anzahl Semester'),
    y=alt.Y('Graduation_Semester', title='Jahr'),
    color='Graduation_Grade',
)
# plotly express: box plot per study duration, split by graduation grade.
# NOTE(review): the y-axis plots the raw Student_ID column while being
# labelled "Studentenanzahl" (number of students) -- confirm this is intended.
fig = px.box(
    graduate_students,
    x="Duration_Start_Graduation",
    y="Student_ID",
    color="Graduation_Grade",
    title="Abschlussnote vs Dauer des Studiums",
)
# same layout settings as before, written with plotly's underscore shorthand
fig.update_layout(
    width=1000,
    height=500,
    xaxis_title="Dauer des Studiums (Semester) ",
    xaxis_nticks=20,
    yaxis_title="Studentenanzahl",
    legend_title_text='Abschlussnote',
)
fig.show()
# plotly express: violin plot (with an inner box and all points drawn) of
# Duration_Min_Max_Exam for the dropout cohort, coloured by gender.
fig = px.violin(
    dropout_students,
    y="Duration_Min_Max_Exam",
    color="Gender",
    box=True,
    points='all',
)
# the original also passed an empty x-axis dict, which changed nothing
fig.update_layout(
    yaxis_nticks=30,
    yaxis_title="Semesteranzahl",
    legend_title_text='Geschlecht',
)
fig.show()
# plotly express: parallel-categories diagram of gender against
# Duration_Min_Max_Exam for the dropout cohort, coloured by that duration.
ds = dropout_students[['Gender', 'Duration_Min_Max_Exam']]
axis_labels = {"Duration_Min_Max_Exam": "Dauer", "Gender": "Geschlecht"}
fig = px.parallel_categories(
    ds,
    color="Duration_Min_Max_Exam",
    color_continuous_scale=px.colors.diverging.Tealrose,
    labels=axis_labels,
    title="Anzahl der Semester zum Abbruch",
)
fig.show()